In [None]:
import marimo as mo

# MLP | Term-3 | 2025 Kaggle Assignment-1
Use the training data provided to train your model and make predictions on the test data for which the test labels will be hidden.

---

The dataset requires you to make predictions on the price of a house given a set of features. The features are listed below.

## Files
- **train.csv** - The training set which contains the features and the target
- **test.csv** - The test set for which the target column is hidden
- **sample_submission.csv** - A sample submission file in the correct format

## Column Description
- **id:** A unique identifier
- **area_type:** Each house is associated with a specific area_type
- **availability:** Is the house available to move into?
- **location:** Location of the house
- **size:** Details on Bedrooms, Halls and Kitchens
- **total_sqft:** Total area of the house in square feet
- **bath:** Number of bathrooms in the house
- **balcony:** Number of balconies in the house
- **price:** Price of the house. This is the target variable.

In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import (
    LinearRegression,
    # LogisticRegression,
    Ridge,
    Lasso,
)
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import (
    # root_mean_squared_error,
    mean_squared_error,
    mean_absolute_error,
    r2_score,
)
from sklearn.preprocessing import (
    OneHotEncoder,
    StandardScaler,
    PolynomialFeatures,
)

from xgboost import XGBRegressor

In [None]:
# Kaggle kernels are detected by the presence of this environment variable.
IS_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ

# Competition input directory on Kaggle, a local `datasets/` folder otherwise.
DATA_DIR = (
    Path("/kaggle/input/mlp-term-3-2025-kaggle-assignment-1/")
    if IS_KAGGLE
    else Path("datasets/")
)

In [None]:
# Read the three competition CSVs from DATA_DIR in a single unpacking.
trainDF, testDF, sampleSubDF = (
    pd.read_csv(DATA_DIR / _csv_name)
    for _csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Print dtypes / non-null counts, then display the first five rows.
trainDF.info()
trainDF.head(n=5)

In [None]:
# Rows without a target value are removed from the training frame in place.
_rows_without_price = trainDF.index[trainDF["price"].isna()]
trainDF.drop(index=_rows_without_price, inplace=True)

In [None]:
# Text columns: missing entries become the sentinel "Unknown".
for _col in ("location", "size"):
    trainDF[_col] = trainDF[_col].fillna("Unknown")

# Numeric columns: missing entries become that column's median.
for _col in ("total_sqft", "bath", "balcony"):
    trainDF[_col] = trainDF[_col].fillna(trainDF[_col].median())

In [None]:
def _leading_int(value):
    """Return the first whitespace-separated token of `value` as an int.

    NaN is returned when `value` has no tokens (blank string) or when the
    first token is not made of digits only.
    """
    tokens = str(value).split()
    # A blank string splits to an empty list; indexing it would raise IndexError.
    return int(tokens[0]) if tokens and tokens[0].isdigit() else np.nan


# Room count parsed from the free-text `size` column; unparseable values
# fall back to the median of the parsed values.
trainDF["bhk"] = trainDF["size"].apply(_leading_int)
trainDF["bhk"] = trainDF["bhk"].fillna(trainDF["bhk"].median())

# Per-room ratios, computed column-wise instead of row-by-row.
# Rows whose bhk is not > 0 (including NaN) get 0, as before.
_has_rooms = trainDF["bhk"] > 0
trainDF["bath_per_bhk"] = (trainDF["bath"] / trainDF["bhk"]).where(_has_rooms, 0)
trainDF["balcony_per_bhk"] = (trainDF["balcony"] / trainDF["bhk"]).where(
    _has_rooms, 0
)

# Availability flag: 1 / "Ready" when the text contains "Ready", else 0 / "Future".
_is_ready = (
    trainDF["availability"].astype(str).str.contains("Ready", regex=False, na=False)
)
trainDF["is_ready"] = _is_ready.astype(int)
trainDF["availability_type"] = np.where(_is_ready, "Ready", "Future")

# log(1 + x) of the area.
trainDF["log_total_sqft"] = np.log1p(trainDF["total_sqft"])

In [None]:
# `id` and the raw `size` text are removed from the training frame in place.
trainDF.drop(columns=["id", "size"], inplace=True)

In [None]:
# Target vector first, then every remaining column as the feature frame.
y = trainDF["price"]
X = trainDF.drop(columns="price")

In [None]:
# Hold out 20% of the rows for validation; the seed keeps the split reproducible.
_split_parts = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = _split_parts

In [None]:
# Columns that are one-hot encoded.
categorical_cols = ["area_type", "availability", "location"]

# Columns that are scaled and polynomially expanded, grouped by origin.
numeric_cols = (
    ["total_sqft", "log_total_sqft"]  # area, raw and log-transformed
    + ["bath", "balcony", "bhk"]  # counts
    + ["bath_per_bhk", "balcony_per_bhk"]  # per-room ratios
    + ["is_ready"]  # binary availability flag
)

In [None]:
# Numeric branch: standardise, then add degree-2 polynomial terms (no bias column).
numeric_transformer = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("poly", PolynomialFeatures(degree=2, include_bias=False)),
    ]
)

# Categorical columns are one-hot encoded (categories unseen during fit are
# ignored at transform time); numeric columns go through the branch above.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", numeric_transformer, numeric_cols),
    ]
)

# Estimator used for the final fit. Alternatives that were tried:
#   LinearRegression(n_jobs=-1)
#   RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
model = GradientBoostingRegressor(random_state=42)

pipeline = Pipeline([("preprocessor", preprocessor), ("model", model)])

In [None]:
# Candidate regressors compared by cross-validation, keyed by display name.
# A RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1) entry
# named "RandomForest" is currently disabled.
models = dict(
    [
        ("LinearRegression", LinearRegression(n_jobs=-1)),
        ("Ridge", Ridge(alpha=1.0)),
        ("Lasso", Lasso(alpha=0.001, max_iter=5000)),
        ("GradientBoosting", GradientBoostingRegressor(random_state=42)),
        (
            "XGBoost",
            XGBRegressor(random_state=42, n_estimators=300, learning_rate=0.1),
        ),
    ]
)

In [None]:
# 5-fold cross-validated R^2 for every candidate, each wrapped with the shared
# preprocessor so that encoding/scaling is fitted inside every fold.
for _name, _model in models.items():
    _pipe = Pipeline(steps=[("preprocessor", preprocessor), ("model", _model)])
    _scores = cross_val_score(_pipe, X, y, cv=5, scoring="r2")
    # "\u00b2" is the superscript two; the escape keeps the label intact
    # regardless of the encoding the notebook file is saved with.
    print(
        f"{_name:<20}: R\u00b2 mean = {_scores.mean():.3f}  |  std = {_scores.std():.3f}"
    )

In [None]:
# Fit preprocessing and estimator together on the training split only;
# X_test / y_test stay unseen for the evaluation cells.
pipeline.fit(X_train, y_train)

In [None]:
# In-sample fit quality: predictions on the same rows the pipeline was fitted on.
_train_pred = pipeline.predict(X_train)

# RMSE is taken as the square root of MSE (the `root_mean_squared_error`
# import is commented out in the imports cell).
_mae, _rmse, _r2 = (
    mean_absolute_error(y_train, _train_pred),
    mean_squared_error(y_train, _train_pred) ** 0.5,
    r2_score(y_train, _train_pred),
)

print(
    "Train Data Stats:",
    f"\tMean Absolute Error: {_mae:.2f}",
    f"\tRoot Mean Square Error: {_rmse:.2f}",
    f"\tR^2 Score: {_r2:.2f}",
    sep="\n",
)

In [None]:
# Hold-out quality: predictions on the 20% split the pipeline was not fitted on.
_holdout_pred = pipeline.predict(X_test)

# RMSE is taken as the square root of MSE (the `root_mean_squared_error`
# import is commented out in the imports cell).
_mae, _rmse, _r2 = (
    mean_absolute_error(y_test, _holdout_pred),
    mean_squared_error(y_test, _holdout_pred) ** 0.5,
    r2_score(y_test, _holdout_pred),
)

print(
    "Test Data Stats:",
    f"\tMean Absolute Error: {_mae:.2f}",
    f"\tRoot Mean Square Error: {_rmse:.2f}",
    f"\tR^2 Score: {_r2:.2f}",
    sep="\n",
)

In [None]:
# Text columns: same "Unknown" sentinel that was used for the training frame.
for _col in ("location", "size"):
    testDF[_col] = testDF[_col].fillna("Unknown")

# Numeric columns: fill with the TRAINING medians, not the test-set medians.
# The model was fitted on data imputed with training statistics, so the
# submission rows must be imputed with the same values.
# (Median-filling trainDF earlier does not change its median.)
for _col in ("total_sqft", "bath", "balcony"):
    testDF[_col] = testDF[_col].fillna(trainDF[_col].median())

In [None]:
def _leading_int(value):
    """Return the first whitespace-separated token of `value` as an int.

    NaN is returned when `value` has no tokens (blank string) or when the
    first token is not made of digits only.
    """
    tokens = str(value).split()
    # A blank string splits to an empty list; indexing it would raise IndexError.
    return int(tokens[0]) if tokens and tokens[0].isdigit() else np.nan


# Room count parsed from `size`; unparseable values take the TRAINING median
# so test rows are imputed with the same statistic the model was fitted with.
testDF["bhk"] = testDF["size"].apply(_leading_int)
testDF["bhk"] = testDF["bhk"].fillna(trainDF["bhk"].median())

# NOTE: the former `price_per_sqft = 0` placeholder column was removed; it is
# not listed in categorical_cols / numeric_cols and never reached the model.

# Per-room ratios, computed column-wise instead of row-by-row.
# Rows whose bhk is not > 0 (including NaN) get 0, as before.
_has_rooms = testDF["bhk"] > 0
testDF["bath_per_bhk"] = (testDF["bath"] / testDF["bhk"]).where(_has_rooms, 0)
testDF["balcony_per_bhk"] = (testDF["balcony"] / testDF["bhk"]).where(
    _has_rooms, 0
)

# Availability flag: 1 / "Ready" when the text contains "Ready", else 0 / "Future".
_is_ready = (
    testDF["availability"].astype(str).str.contains("Ready", regex=False, na=False)
)
testDF["is_ready"] = _is_ready.astype(int)
testDF["availability_type"] = np.where(_is_ready, "Ready", "Future")

# log(1 + x) of the area.
testDF["log_total_sqft"] = np.log1p(testDF["total_sqft"])

In [None]:
# The raw `size` text is removed in place; `id` is kept for the submission file.
testDF.drop(columns=["size"], inplace=True)

In [None]:
# Pass exactly the feature columns the pipeline was fitted on, in training
# order. testDF also carries non-feature columns (e.g. `id`), and selecting
# explicitly avoids relying on the transformer tolerating extra columns.
test_y_pred = pipeline.predict(testDF[X_train.columns])

In [None]:
# Two-column frame: the test ids alongside the predicted prices.
submission_df = testDF[["id"]].assign(price=test_y_pred)

In [None]:
# Summary statistics of the submission frame, shown as the cell output
# as a sanity check before the file is written.
submission_df.describe(include="all")

In [None]:
# The submission file is only written when running inside a Kaggle kernel.
if IS_KAGGLE:
    # index=False: without it pandas writes the row index as an extra unnamed
    # first column, so the file would no longer be just `id,price`.
    submission_df.to_csv("submission.csv", index=False)